Setup

Install a package countrycode first.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(WDI)
library(readxl)
library(countrycode)

World Inequality Database

https://wid.world

World Inequality Report 2022

Since Excel files are binary file, you need to add mode = “wb”, web binary.

url_summary <- "https://wir2022.wid.world/www-site/uploads/2022/03/WIR2022TablesFigures-Summary.xlsx"
download.file(url = url_summary, destfile = "data/WIR2022s.xlsx", mode = "wb")
library(readxl) # readxl is a part of tidyverse but not a core package
excel_sheets("data/WIR2022s.xlsx")
##  [1] "Index"     "F1"        "F2"        "F3"        "F4"        "F5."      
##  [7] "F6"        "F7"        "F8"        "F9"        "F10"       "F11"      
## [13] "F12"       "F13"       "F14"       "F15"       "T1"        "data-F1"  
## [19] "data-F2"   "data-F3"   "data-F4"   "data-F5"   "data-F6"   "data-F7"  
## [25] "data-F8"   "data-F9"   "data-F10"  "data-F11"  "data-F12"  "data-F13."
## [31] "data-F14." "data-F15"
df_wir_f2 <- read_excel("data/WIR2022s.xlsx", 
    sheet = "data-F2")
df_wir_f2

Created a new Excel book with only one sheet, and save it as a CSV UTF-8.

df_wir_f2_2 <- read_csv("data/wir-f2.csv")
## Rows: 8 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): iso, Bottom 50%, Middle 40%, Top 10%
## dbl (1): year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_wir_f2_2
df_wir_f2_2 |> identical(df_wir_f2)
## [1] FALSE
df_wir_f2_3 <- read_delim(clipboard())
df_wir_f2_3
df_wir_f2 |> pivot_longer(3:5) |> 
  ggplot(aes(iso, value, fill = name)) + geom_col(position = "dodge") + coord_flip()

See: https://ds-sl.github.io/data-analysis/wir2022.nb.html

To share data with someone else.

dput(df_wir_f2)
## structure(list(year = c(2021, 2021, 2021, 2021, 2021, 2021, 2021, 
## 2021), iso = c("Europe", "East Asia", "North America", "Russia & Central Asia", 
## "South & South East Asia", "Latin America", "Sub-Saharan Africa", 
## "MENA"), `Bottom 50%` = c(0.1891, 0.1391, 0.1322, 0.1466, 0.1233, 
## 0.1016, 0.0892, 0.09), `Middle 40%` = c(0.4531, 0.4268, 0.4105, 
## 0.3863, 0.3283, 0.3445, 0.3537, 0.3288), `Top 10%` = c(0.3578, 
## 0.4341, 0.4573, 0.4671, 0.5484, 0.5539, 0.5571, 0.5812)), class = c("tbl_df", 
## "tbl", "data.frame"), row.names = c(NA, -8L))
df_wir_dput <- dput(df_wir_f2)
## structure(list(year = c(2021, 2021, 2021, 2021, 2021, 2021, 2021, 
## 2021), iso = c("Europe", "East Asia", "North America", "Russia & Central Asia", 
## "South & South East Asia", "Latin America", "Sub-Saharan Africa", 
## "MENA"), `Bottom 50%` = c(0.1891, 0.1391, 0.1322, 0.1466, 0.1233, 
## 0.1016, 0.0892, 0.09), `Middle 40%` = c(0.4531, 0.4268, 0.4105, 
## 0.3863, 0.3283, 0.3445, 0.3537, 0.3288), `Top 10%` = c(0.3578, 
## 0.4341, 0.4573, 0.4671, 0.5484, 0.5539, 0.5571, 0.5812)), class = c("tbl_df", 
## "tbl", "data.frame"), row.names = c(NA, -8L))
df_wir_dput
df_wir_f2_3 <- df_wir_dput
df_wir_f2_3 |> identical(df_wir_f2)
## [1] TRUE

UNdata - a world of information

https://data.un.org/

Popular statistical tables

Explorer - datamarts: http://data.un.org/Explorer.aspx

UN Migrants Data

url_un_migrants <- "https://data.un.org/_Docs/SYB/CSV/SYB66_327_202310_International%20Migrants%20and%20Refugees.csv"

Since the url is given, a CSV file can be read directly using read_csv.

df_un_migrants <- read_csv(url_un_migrants, skip = 1)
## New names:
## Rows: 7238 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): ...2, Series, Footnotes, Source dbl (2): Region/Country/Area, Year num
## (1): Value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
df_un_migrants
str(df_un_migrants)
## spc_tbl_ [7,238 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Region/Country/Area: num [1:7238] 1 1 1 1 1 1 1 1 1 1 ...
##  $ ...2               : chr [1:7238] "Total, all countries or areas" "Total, all countries or areas" "Total, all countries or areas" "Total, all countries or areas" ...
##  $ Year               : num [1:7238] 2005 2005 2005 2005 2010 ...
##  $ Series             : chr [1:7238] "International migrant stock: Both sexes (number)" "International migrant stock: Both sexes (% total population)" "International migrant stock: Male (% total Population)" "International migrant stock: Female (% total Population)" ...
##  $ Value              : num [1:7238] 1.91e+08 2.90 3.00 2.90 2.21e+08 ...
##  $ Footnotes          : chr [1:7238] NA NA NA NA ...
##  $ Source             : chr [1:7238] "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Region/Country/Area` = col_double(),
##   ..   ...2 = col_character(),
##   ..   Year = col_double(),
##   ..   Series = col_character(),
##   ..   Value = col_number(),
##   ..   Footnotes = col_character(),
##   ..   Source = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
df_un_migrants |> summary()
##  Region/Country/Area     ...2                Year         Series         
##  Min.   :  1.0       Length:7238        Min.   :2005   Length:7238       
##  1st Qu.:178.0       Class :character   1st Qu.:2010   Class :character  
##  Median :404.0       Mode  :character   Median :2015   Mode  :character  
##  Mean   :407.1                          Mean   :2014                     
##  3rd Qu.:630.0                          3rd Qu.:2020                     
##  Max.   :894.0                          Max.   :2022                     
##      Value            Footnotes            Source         
##  Min.   :        0   Length:7238        Length:7238       
##  1st Qu.:        5   Class :character   Class :character  
##  Median :      154   Mode  :character   Mode  :character  
##  Mean   :   731059                                        
##  3rd Qu.:    31096                                        
##  Max.   :280598105
df_un_migrants |> select(Year,Series) |> lapply(unique)
## $Year
##  [1] 2005 2010 2015 2020 2022 2019 2018 2016 2021 2017
## 
## $Series
## [1] "International migrant stock: Both sexes (number)"             
## [2] "International migrant stock: Both sexes (% total population)" 
## [3] "International migrant stock: Male (% total Population)"       
## [4] "International migrant stock: Female (% total Population)"     
## [5] "Total refugees and people in refugee-like situations (number)"
## [6] "Asylum seekers, including pending cases (number)"             
## [7] "Other of concern to UNHCR (number)"                           
## [8] "Total population of concern to UNHCR (number)"
df_un_migrants |> distinct(`Region/Country/Area`, ...2)
library(countrycode)
df_un_migrants %>% mutate(iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c"), .after = ...2)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c")`.
## Caused by warning:
## ! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
df_un_migrants |> mutate(iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c"), .after = ...2) |> filter(is.na(iso2c_un)) |> distinct(...2)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c")`.
## Caused by warning:
## ! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
wdicache <- read_rds("data/wdicache.rds")
wdi_country_extra <- wdicache$country |> select(iso2c, region, income, lending)
df_un_migrants_ext <- df_un_migrants %>% 
  mutate(iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c"), .after = ...2) |>
  drop_na(iso2c_un) |> 
  select(Country = ...2, ISO2C = iso2c_un, Year, Series, Value, Footnotes) |>
  left_join(wdi_country_extra, by = c("ISO2C" = "iso2c"))
df_un_migrants_ext
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c")`.
## Caused by warning:
## ! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
df_un_migrants_ext$Series |> unique()
## [1] "International migrant stock: Both sexes (number)"             
## [2] "International migrant stock: Both sexes (% total population)" 
## [3] "International migrant stock: Male (% total Population)"       
## [4] "International migrant stock: Female (% total Population)"     
## [5] "Total refugees and people in refugee-like situations (number)"
## [6] "Asylum seekers, including pending cases (number)"             
## [7] "Other of concern to UNHCR (number)"                           
## [8] "Total population of concern to UNHCR (number)"
df_un_migrants_ext_rev <- df_un_migrants_ext |> mutate(Ser = case_when(
  Series == "International migrant stock: Both sexes (number)" ~ "migrant",
  Series == "International migrant stock: Both sexes (% total population)" ~ "migrant_percent",
  Series == "International migrant stock: Male (% total Population)" ~ "migrant_male",
  Series == "International migrant stock: Female (% total Population)" ~ "migrant_female",
  Series == "Total refugees and people in refugee-like situations (number)" ~ "refugee",
  Series == "Asylum seekers, including pending cases (number)" ~ "asylum",
  Series == "Other of concern to UNHCR (number)"  ~ "other",
  Series == "Total population of concern to UNHCR (number)" ~ "concern",
  TRUE ~ Series), .before = Series)
df_un_migrants_ext_rev
dput(df_un_migrants_ext_rev)

Explorer - datamarts

Environment Statistics Database>Water>Internal flow [Link]

Select columns to add Table ID and Country and Area Code of UN, and download ‘Comma’ a CSV.

df_un_water <- read_csv("data/UNdata_Export_20240124_010609671.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 1604 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country or Area Code, Country or Area, Unit
## dbl (4): Table ID, Year, Value, Value Footnotes
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_water

OECD data

OECD data top

OECD data https://data.oecd.org/

  • Browse by Topics (Choose from 12 topics) or Country (Choose from 37 countries)
  • Topics:
    • Agriculture,
    • Development,
    • Economy,
    • Education,
    • Energy,
    • Environment,
    • Finance,
    • Government,
    • Health,
    • Innovation and Technology,
    • Jobs,
    • Society
      • Demography
      • Inequality
      • Migration
      • Population by Region
      • Social protection
  • Countries:
    • Australia, Austria, Belgium, Brazil, Canada, Chili, People’s Republic of China, Columbia, Costa Rica, Czechia, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, India, Indonesia, Ireland, Islael, Italy, Japan, Korea, Latvia, Lithuania, Luxembourg, Mexico, Netherlands, New Zealand, Norway, Poland, Portugal, Russian Federation, Slovak Republic, Slovenia, South Africa, Spain, Sweden, Swizerland, Türkiye, United Kingdom, United States

Database Access: https://data-explorer.oecd.org/

There is a newly developed Database Access linked above. However, it is still under development and difficult to handle data there.

Topic: Society - Migration

  • Permanent immigrant inflows
    • Permanent immigrant inflows cover regulated movements of foreigners considered to be settling in the country from the perspective of the destination country. They cover regulated movements of foreigners as well as free movement migration. The data presented are the result of a standardisation process that allows for cross-country comparisons. This indicator is measured by numbers of permanent inflows.
  • Stocks of foreign-born population in OECD countries
  • Foreign-born population
  • Foreign population
  • Native-born employment
  • Foreign-born employment
  • Native-born unemployment
  • Foreign-born unemployment
  • Native-born participation rates
  • Foreign-born participation rates

Permanent immigrant inflows

Permanent immigrant inflows Total, Number, 2022 Link

Definition of Permanent immigrant inflows

  • Permanent immigrant inflows cover regulated movements of foreigners considered to be settling in the country from the perspective of the destination country. They cover regulated movements of foreigners as well as free movement migration. The data presented are the result of a standardisation process that allows for cross-country comparisons. This indicator is measured by numbers of permanent inflows.
  • Citation: OECD (2024), Permanent immigrant inflows (indicator). doi: 10.1787/304546b6-en (Accessed on 27 January 2024)
library(htmltools)

iframe_code <- '<iframe src="https://data.oecd.org/chart/7kgk" width="860" height="645" style="border: 0" mozallowfullscreen="true" webkitallowfullscreen="true" allowfullscreen="true"><a href="https://data.oecd.org/chart/7kgk" target="_blank">OECD Chart: Permanent immigrant inflows, Total, Number, Annual, 2022</a></iframe>'

HTML(iframe_code)

Expenditure on educational institutions per full-time equivalent student

[Link]

Table in Excel, Filtered data in tableau text (CSV), Unfiltered data in tableau text (CSV) 136.4MB

temp <- read_csv("../../../bigdata/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0+all.csv")
temp
library(readxl)
df_oecd_ed <- read_excel("data/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0,filtered,2024-01-24 10-22-31.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
df_oecd_ed
df_oecd_ed <- read_excel("data/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0,filtered,2024-01-24 10-22-31.xlsx", skip = 6)
## New names:
## • `Education level` -> `Education level...1`
## • `Education level` -> `Education level...2`
## • `` -> `...13`
df_oecd_ed
colnames(df_oecd_ed) 
##  [1] "Education level...1"                                      
##  [2] "Education level...2"                                      
##  [3] "Primary to tertiary education"                            
##  [4] "Primary education"                                        
##  [5] "Lower secondary education"                                
##  [6] "Upper secondary education"                                
##  [7] "· \nUpper secondary general education"                    
##  [8] "· \nUpper secondary vocational education"                 
##  [9] "Post-secondary non-tertiary education"                    
## [10] "Tertiary education"                                       
## [11] "· \nShort-cycle tertiary education"                       
## [12] "· \nBachelor's, Master's and Doctoral or equivalent level"
## [13] "...13"
df_oecd_ed_short <- df_oecd_ed |> select(1,3,4,5,6,9,10) |> drop_na(`Primary education`)
df_oecd_ed_short
df_oecd_ed2 <- read_csv("data/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0+..ISCED11_1+ISCED11_2+ISCED11_3+ISCED11_4+ISCED11_5+ISCED11_5T8+ISCED11_6T8+ISCED11_1T8._T.INST_EDU.DIR_EXP...csv")
## New names:
## Rows: 816 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (12): STRUCTURE, STRUCTURE_ID, ACTION, MEASURE, REF_AREA, EDUCATION_LEV,... dbl
## (4): TIME_PERIOD, OBS_VALUE, UNIT_MULT, DECIMALS lgl (16): STRUCTURE_NAME,
## ...6, ...8, ...10, ...12, ...14, ...16, ...18, ......
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...6`
## • `` -> `...8`
## • `` -> `...10`
## • `` -> `...12`
## • `` -> `...14`
## • `` -> `...16`
## • `` -> `...18`
## • `` -> `...20`
## • `` -> `...22`
## • `` -> `...24`
## • `` -> `...26`
## • `` -> `...28`
## • `` -> `...30`
## • `` -> `...32`
df_oecd_ed2

References